library(tidyverse)
Clustering
Load data
# Build `raw_df_cl`: one row per (stock file, sampled time_id) holding the
# mean bid-ask spread, mean weighted average price and mean order imbalance
# computed from the level-1 book columns.
dir.short <- "data/individual_book_train"
# Restrict to CSV files so any other file in the directory is not parsed.
all.files.short <- list.files(dir.short, pattern = "\\.csv$")

# Number of time_ids sampled per stock file.
n_time_ids <- 10

# Collect per-file summaries in a preallocated list and bind once at the end;
# growing a data frame with rbind() inside the loop copies it on every pass.
book_summaries <- vector("list", length(all.files.short))

for (idx in seq_along(all.files.short)) {
  i <- all.files.short[[idx]]
  stock <- read.csv(file.path(dir.short, i))

  # Randomly select time_ids from this stock. Sampling positions (rather than
  # calling sample() on the values) avoids sample()'s special case for a
  # length-one numeric input, and the size is capped so files with fewer than
  # `n_time_ids` distinct time_ids do not raise an error.
  # The draw uses the global RNG; call set.seed() beforehand for a
  # reproducible selection.
  ids <- unique(stock$time_id)
  time_ids <- ids[sample.int(length(ids), min(n_time_ids, length(ids)))]

  # "stock_12.csv" -> "12": drop the extension, then the leading prefix.
  file_name <- sub("^stock_", "", tools::file_path_sans_ext(i))

  book_summaries[[idx]] <- stock |>
    filter(time_id %in% time_ids) |>
    mutate(
      # Level-1 prices weighted by the opposite side's size.
      WAP = (bid_price1 * ask_size1 + ask_price1 * bid_size1) /
        (bid_size1 + ask_size1),
      # Relative spread between best ask and best bid.
      BidAskSpread = ask_price1 / bid_price1 - 1,
      # Signed size imbalance at level 1.
      imbalance = (bid_size1 - ask_size1) / (bid_size1 + ask_size1)
    ) |>
    group_by(time_id) |>
    summarise(
      mean_BAS = mean(BidAskSpread),
      mean_WAP = mean(WAP),
      imbalance = mean(imbalance)
    ) |>
    mutate(file_name = file_name)

  # Progress indicator: one line per processed file.
  print(file_name)
}

raw_df_cl <- bind_rows(book_summaries)
[1] "0"
[1] "1"
[1] "10"
[1] "100"
[1] "101"
[1] "102"
[1] "103"
[1] "104"
[1] "105"
[1] "107"
[1] "108"
[1] "109"
[1] "11"
[1] "110"
[1] "111"
[1] "112"
[1] "113"
[1] "114"
[1] "115"
[1] "116"
[1] "118"
[1] "119"
[1] "120"
[1] "122"
[1] "123"
[1] "124"
[1] "125"
[1] "126"
[1] "13"
[1] "14"
[1] "15"
[1] "16"
[1] "17"
[1] "18"
[1] "19"
[1] "2"
[1] "20"
[1] "21"
[1] "22"
[1] "23"
[1] "26"
[1] "27"
[1] "28"
[1] "29"
[1] "3"
[1] "30"
[1] "31"
[1] "32"
[1] "33"
[1] "34"
[1] "35"
[1] "36"
[1] "37"
[1] "38"
[1] "39"
[1] "4"
[1] "40"
[1] "41"
[1] "42"
[1] "43"
[1] "44"
[1] "46"
[1] "47"
[1] "48"
[1] "5"
[1] "50"
[1] "51"
[1] "52"
[1] "53"
[1] "55"
[1] "56"
[1] "58"
[1] "59"
[1] "6"
[1] "60"
[1] "61"
[1] "62"
[1] "63"
[1] "64"
[1] "66"
[1] "67"
[1] "68"
[1] "69"
[1] "7"
[1] "70"
[1] "72"
[1] "73"
[1] "74"
[1] "75"
[1] "76"
[1] "77"
[1] "78"
[1] "8"
[1] "80"
[1] "81"
[1] "82"
[1] "83"
[1] "84"
[1] "85"
[1] "86"
[1] "87"
[1] "88"
[1] "89"
[1] "9"
[1] "90"
[1] "93"
[1] "94"
[1] "95"
[1] "96"
[1] "97"
[1] "98"
[1] "99"
Apply clustering
library(caret)
# Prepare the clustering input `df_cl` from `raw_df_cl`: min-max scale the two
# features used for clustering, order rows by numeric stock id, and keep only
# a "<stock> <time_id>" label plus the scaled features.

# Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
# maps to 1.
min_max_scale <- function(x) {
  rng <- range(x)
  (x - rng[1]) / (rng[2] - rng[1])
}

# NOTE(review): the scaling range is computed before stock 31 is filtered out,
# so the remaining rows are scaled relative to a range that includes it.
# Confirm this ordering is intended. mean_WAP is not scaled here because it is
# dropped from the result.
df_cl <- raw_df_cl |>
  mutate(
    mean_BAS = min_max_scale(mean_BAS),
    imbalance = min_max_scale(imbalance),
    # Numeric ids so ordering is 0, 1, 2, ... rather than "0", "1", "10", ...
    file_name = as.numeric(file_name)
  ) |>
  arrange(file_name) |>
  mutate(name = paste(file_name, time_id, sep = " ")) |>
  filter(file_name != 31) |>
  select(name, mean_BAS, imbalance)
Find optimal k - scree plot
# Elbow analysis: total within-cluster sum of squares for k = 1..max_k,
# computed on every column of `df_cl` except the first (the label column).
max_k <- 15
cluster_features <- df_cl[-1]

# One k-means fit per candidate k (20 random starts each); vapply returns a
# fixed-length numeric vector instead of growing `wss` element by element.
wss <- vapply(
  seq_len(max_k),
  \(n_centers) {
    kmeans(cluster_features, centers = n_centers, nstart = 20)$tot.withinss
  },
  numeric(1)
)

# Plot total within sum of squares vs. number of clusters
plot(seq_len(max_k), wss,
  type = "b",
  xlab = "Number of Clusters",
  ylab = "Within groups sum of squares"
)

# Set k equal to the number of clusters corresponding to the elbow location
k <- 4
Cluster with k = 4
# Final k-means fit on the feature columns (everything but the label column).
km.out <- kmeans(df_cl[-1], centers = k, nstart = 20)

# Plotting frame: row label, the two features, and the assigned cluster as a
# factor so ggplot uses a discrete colour scale.
df <- with(df_cl, data.frame(
  names = name,
  mean_BAS = mean_BAS,
  imbalance = imbalance,
  cluster = factor(km.out$cluster)
))

# Labelled scatter plot of the clusters, built up layer by layer.
plot <- ggplot(
  df,
  aes(x = mean_BAS, y = imbalance, color = cluster, label = names)
)
plot <- plot +
  geom_point() +
  geom_text(aes(label = names), vjust = -1, hjust = 1)
plot <- plot +
  theme_minimal() +
  labs(title = "Cluster Plot", x = "mean_BAS", y = "imbalance")

# Render the ggplot as an interactive plotly widget.
library(plotly)
ggplotly(plot)